import re
import requests
from bs4 import BeautifulSoup
import csv
import os

# Shared accumulators filled by the scraping functions below.
city_list = []
tourist_list = []
data = []
title_list = []
browser_list = []

# Load the city names to search, one city per line.
with open('./text/全国城市.txt', encoding='utf-8') as f:
    search = f.read().strip()
search_list = search.split('\n')


# cnt = 0
def hot_city_tourist():
    """Scrape, for every city in ``search_list``, the destination name and the
    "people who have been there" count from mafengwo's search page, then dump
    the pairs to ./text/全国城市热度.txt as ``city,count`` lines.

    Side effects: appends to the module-level ``city_list``, ``tourist_list``
    and ``data`` lists, and (re)writes ./text/全国城市热度.txt.
    """
    # Capture the text between '>' and '<' of the first matching tag.
    tag_text = re.compile(r'>(.*?)<')
    for search_city in search_list:
        print(search_city)
        search_city = search_city.replace('市', '')
        url = 'https://www.mafengwo.cn/search/q.php?q=' + str(search_city)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 '
                          'Safari/537.36 Edg/109.0.1518.61 ',
            'Host': 'www.mafengwo.cn',
            'Cookie': '__jsluid_s=d3db61a0e5b7d2f3ebe46fc242b4d50c; mfw_uuid=63ca952a-fc7e-9255-dc7b-fc67323dcc47; '
                      'oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2023-01-20+21:20:42";}; '
                      '__mfwc=direct; uva=s:92:"a:3:{s:2:"lt";i:1674220844;s:10:"last_refer";s:24:"https://www.mafengwo.cn'
                      '/";s:5:"rhost";N;}";; __mfwurd=a:3:{'
                      's:6:"f_time";i:1674220844;s:9:"f_rdomain";s:15:"www.mafengwo.cn";s:6:"f_host";s:3:"www";}; '
                      '__mfwuuid=63ca952a-fc7e-9255-dc7b-fc67323dcc47; __jsluid_h=3e9a93c05a720da6489c8c6194e80523; '
                      '__omc_chl=; __omc_r=; PHPSESSID=m251pri7l2dbdvqvojatdrse75; __mfwlv=1674614862; __mfwvn=2; '
                      'Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1674220843,1674614863; bottom_ad_status=0; '
                      '__mfwb=c4818c8a23f3.1.direct; __mfwa=1674220841568.75085.3.1674614862237.1674618443150; '
                      '__mfwlt=1674618443; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1674618444 ',
            'sec-ch-ua': '"Not_A Brand";v="99", "Microsoft Edge";v="109", "Chromium";v="109"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                      'application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive'
        }

        # timeout added: the original request could hang forever on a stalled
        # connection and freeze the whole crawl.
        page_text = requests.get(url=url, headers=headers, timeout=30).content
        soup = BeautifulSoup(page_text, 'lxml')
        city_matches = tag_text.findall(
            str(soup.select('.mfw-search-main > div > div > .search-mdd-wrap > a > div > .title ')))
        tourist_matches = tag_text.findall(
            str(soup.select('.mfw-search-main > div > div > .search-mdd-wrap > a > div > .content > b > font')))

        # BUG FIX: the original indexed [0] unconditionally and crashed with
        # IndexError when the search page had no destination card for a city.
        if not city_matches or not tourist_matches:
            print('no search result for', search_city)
            continue

        # The capture group already yields the bare text, so no '<'/'>'
        # stripping is needed.
        city = city_matches[0]
        tourist = tourist_matches[0]
        print(city, ' ', tourist)

        city_list.append(city)
        tourist_list.append(tourist)

    data.extend(zip(city_list, tourist_list))

    # newline='\n' keeps the output byte-identical to the original binary
    # write, so downstream parsers see plain '\n' even on Windows.
    with open('./text/全国城市热度.txt', 'w', encoding='utf-8', newline='\n') as f:
        for city, tourist in data:
            # Same flattening as before: "city,count" with all spaces removed.
            f.write(('%s,%s' % (city, tourist)).replace(' ', '') + '\n')

    print(data)


def tourist_csv_transform():
    """Convert ./text/全国城市热度.txt (``city,count`` per line) into a CSV
    file with a Chinese header row.

    Reads: ./text/全国城市热度.txt (written by ``hot_city_tourist``).
    Writes: ./csv文件/全国城市热度1.csv.
    """
    header = ['城市', '去过的人']
    # 'w' instead of 'w+' (the file is never read back here), and an explicit
    # encoding: the original used the platform default, which mangles the
    # Chinese text on any non-GBK/non-UTF-8 locale.
    with open('./csv文件/全国城市热度1.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(header)
        with open('./text/全国城市热度.txt', 'r', encoding='utf-8') as filein:
            for line in filein:
                # strip only the trailing newline (writerow appends its own
                # line terminator), then split into the two CSV columns.
                writer.writerow(line.strip('\n').split(','))


def hot_city_browser():
    """For every city in ``search_list``, scrape the travel-note search page
    and save one ``city,title,views`` line per note to ./城市游记/<city>.txt.

    Side effects: uses the module-level ``title_list``/``browser_list``/``data``
    lists as scratch space (cleared after each city) and writes one text file
    per city.
    """
    for each_city in search_list:
        city = each_city.replace("市", "")
        url = ('https://www.mafengwo.cn/search/q.php?q=' + city
               + '&t=notes&seid=&mxid=&mid=&mname=&kt=1')
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/39.0.2171.95 Safari/537.36",
            'Host': 'www.mafengwo.cn'
        }
        # timeout added so a stalled connection cannot hang the crawl.
        page_text = requests.get(url, headers=headers, timeout=30).content
        soup = BeautifulSoup(page_text, 'lxml')
        li_list = soup.select('.mfw-search-main > div > #_j_search_result_left > div > div > ul > li')
        for each_li in li_list:
            # .string yields the tag's text content (the note title).
            title = each_li.h3.a.string
            title_list.append(title)
            # BUG FIX: the original wrote ``type(x) != 'NoneType'`` — comparing
            # a type object against a *string* is always True, so .replace()
            # was called on None and crashed for notes without a view counter.
            views_tag = each_li.li
            if views_tag is not None and views_tag.string is not None:
                browser = views_tag.string.replace("\n", '').replace(" ", "").replace("浏览", "")
            else:
                browser = 0
            browser_list.append(browser)

        for title, browser in zip(title_list, browser_list):
            data.append((city, title, browser))

        title_list.clear()
        browser_list.clear()
        print(data)
        # newline='\n' keeps the bytes identical to the original binary write.
        with open("./城市游记/" + str(city) + ".txt", "w", encoding='utf-8', newline='\n') as f:
            for record in data:
                # Same flattening as the original: str((city, title, views))
                # with parens, quotes and all spaces removed.
                write_in = str(record).replace('(', '').replace(')', '').replace("'", '').replace(" ", '') + '\n'
                f.write(write_in)

        data.clear()


def browser_csv_transform():
    """Convert each ./城市游记/<city>.txt into a per-city CSV, then merge all
    per-city CSVs into a single 汇总.csv.

    Reads: ./城市游记/*.txt (written by ``hot_city_browser``).
    Writes: ./csv文件/各城市游记浏览量/<city>.csv and .../汇总.csv.
    """
    header = ['城市', '游记', '浏览量']
    cnt = 0
    for city in search_list:
        city = city.replace("市", "")
        # 'w' instead of 'w+': the file is write-only here.
        with open("./csv文件/各城市游记浏览量/" + city + ".csv", "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile, dialect="excel")
            writer.writerow(header)
            with open("./城市游记/" + city + ".txt", "r", encoding="utf-8") as f:
                for line in f:
                    # Trailing '\n' stripped because writerow adds its own
                    # line ending; split into the three CSV columns.
                    writer.writerow(line.strip('\n').split(','))
                    cnt += 1
    print(cnt)  # total rows converted across all cities

    # Rebuild the aggregate file from scratch so stale rows never survive,
    # and snapshot the directory listing *before* creating 汇总.csv so the
    # summary never ingests itself.
    summary_path = "./csv文件/各城市游记浏览量/汇总.csv"
    if os.path.exists(summary_path):
        os.remove(summary_path)
    csv_list = os.listdir("./csv文件/各城市游记浏览量/")
    cnt = 0
    with open(summary_path, "w+", newline="", encoding="utf-8") as csv_f:
        writer = csv.writer(csv_f)
        writer.writerow(["城市", "游记", "浏览量"])
        for name in csv_list:
            with open("./csv文件/各城市游记浏览量/%s" % name, 'r', encoding="utf-8") as f:
                cnt += 1
                print(cnt)  # progress: number of files merged so far
                for line in f:
                    row = line.strip('\n')
                    # BUG FIX: the original indexed line[0] and raised
                    # IndexError on empty lines; also skip each file's
                    # header row ("城市,...").
                    if not row or row.startswith("城"):
                        continue
                    writer.writerow(row.split(','))


if __name__ == '__main__':
    # Pipeline switch: the stages below run in order, each consuming the
    # files produced by the previous one. Uncomment the stage(s) to run.
    # hot_city_tourist()
    # tourist_csv_transform()
    # hot_city_browser()
    browser_csv_transform()
